# Connect to the Guardian API (gu_api_key() prompts for / registers the key)
library(guardianapi)
gu_api_key()

# Retrieve articles matching "police" OR "law enforcement"
# published between 1 Jan 2021 and 31 Mar 2023
data_all <- gu_content(
  query = '"police" OR "law enforcement"',
  from_date = "2021-01-01",
  to_date = "2023-03-31"
)

# Inspect the result
dim(data_all)
head(data_all)
# 24863 news articles collected

# Keep only articles from the 'UK news' section
data_uk_sec <- subset(data_all, section_name == "UK news")
dim(data_uk_sec)
head(data_uk_sec)
# Save as CSV file
library(dplyr)
my_data <- data_uk_sec %>%
  select(-tags) # drop the non-informative column of nested lists (not CSV-serialisable)
my_df <- as.data.frame(my_data)
# row.names = FALSE: writing row names would add a spurious unnamed "X"
# column when the file is read back with read.csv()
write.csv(my_df, "raw_data.csv", row.names = FALSE)
# Read collected data back
my_data <- read.csv('raw_data.csv')
dim(my_data)
#> [1] 3259 47
# (console output above kept as a comment so the script sources cleanly)
# Load required packages
library(tidyr)
library(quanteda)
library(caret)
library(dplyr)
library(tidytext)
library(syuzhet)
library(tidyverse)
library(quanteda.textstats)
library(ggplot2)

# Pre-processing ----
# Select required columns
my_data_hbd <- my_data %>%
  select(headline, body_text, first_publication_date)
# Remove duplicates
my_data_hbd <- distinct(my_data_hbd)
# Remove noise: collapse runs of non-word characters to single spaces
my_data_hbd$body_text <- gsub("\\W+", " ", my_data_hbd$body_text)
# Tokenise; remove punctuation, numbers, and symbols; drop English
# stopwords; lowercase; and stem with the default quanteda stemmer
my_data_tokens <- tokens(my_data_hbd$body_text,
                         remove_punct = TRUE,
                         remove_numbers = TRUE,
                         remove_symbols = TRUE) %>%
  tokens_remove(stopwords("english")) %>%
  tokens_tolower() %>%
  tokens_wordstem(language = quanteda_options("language_stemmer"))
dim(my_data_hbd) # 3194 UK news articles after pre-processing
#> [1] 3194 3
# (console output above commented out so the script sources cleanly)
head(my_data_hbd)
## Bing lexicon ----
# Sentiment score per news article using the "bing" lexicon
# Convert the data to a tidy (one token per row) format
tokens <- my_data_hbd %>%
  unnest_tokens(word, body_text) %>%
  anti_join(stop_words, by = "word") # explicit key silences the join message
# Load the lexicon
lexicon <- get_sentiments("bing")
# Attach a sentiment ("positive"/"negative") to each matched token
my_data_sentiment <- tokens %>%
  inner_join(lexicon, by = "word") # explicit key silences the join message
# Sentiment score per article = positive count - negative count
my_data_sentiment_scores <- my_data_sentiment %>%
  count(headline, sentiment) %>%
  pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>%
  mutate(sentiment_score = positive - negative)
# Visualise the sentiment scores
ggplot(my_data_sentiment_scores, aes(x = sentiment_score)) +
geom_histogram(binwidth = 2, color = "black", fill = "lightblue") +
labs(title = "Distribution of Sentiment Scores",
x = "Sentiment Score",
y = "Frequency") +
theme_minimal()

# Categorise sentiment scores as positive, negative, or neutral
# case_when() replaces the nested ifelse() — clearer and consistent
# with the encoding step used later in this script
my_data_sentiment_category <- my_data_sentiment_scores %>%
  mutate(sentiment_category = case_when(
    sentiment_score > 0 ~ "Positive",
    sentiment_score < 0 ~ "Negative",
    TRUE ~ "Neutral"
  ))
# Count the number of articles in each sentiment category
my_data_sentiment_counts <- my_data_sentiment_category %>%
  count(sentiment_category)
print(my_data_sentiment_counts)
# Visualise the distribution of sentiment categories in articles
ggplot(my_data_sentiment_counts,
       aes(x = sentiment_category, y = n, fill = sentiment_category)) +
  geom_col() +
  geom_text(aes(label = n), position = position_stack(vjust = 0.5),
            color = "black", size = 4) +
  labs(x = "Sentiment Category", y = "Number of Articles",
       fill = "Sentiment Category") +
  ggtitle("Sentiment Categories Proportions Based on Bing Lexicon")

# Explore sentiment change over time based on the Bing lexicon
# Calculate the sentiment score by date
sentiment_scores_date <- my_data_sentiment %>%
  count(first_publication_date, sentiment) %>%
  pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>%
  mutate(sentiment_score = positive - negative)
# Categorise sentiment scores (case_when replaces nested ifelse)
sentiment_category_date <- sentiment_scores_date %>%
  mutate(sentiment_category = case_when(
    sentiment_score > 0 ~ "Positive",
    sentiment_score < 0 ~ "Negative",
    TRUE ~ "Neutral"
  ))
# Convert the character date to a Date object
sentiment_category_date$first_publication_date <- as.Date(sentiment_category_date$first_publication_date)
# Visualise the sentiment scores over time
ggplot(sentiment_category_date, aes(x = first_publication_date, y = sentiment_score)) +
  geom_line(color = "#000080") +
  geom_hline(yintercept = 0, color = "red") +
  labs(title = "Sentiment Score Over Time Based on Bing Lexicon",
       x = "Date",
       y = "Sentiment Score") +
  theme_minimal()

## Sentiment scoring with get_sentiment() from the syuzhet package
library(syuzhet)
# Compute a sentiment score for every article body
my_data_hbd$sentiment_score <- get_sentiment(my_data_hbd$body_text)
# Bucket the continuous scores into Negative / Neutral / Positive
my_data_hbd$sentiment_category <- cut(
  my_data_hbd$sentiment_score,
  breaks = c(-Inf, -0.1, 0.1, Inf),
  labels = c("Negative", "Neutral", "Positive")
)
# Plot the score distribution, coloured by category
ggplot(my_data_hbd, aes(x = sentiment_score, fill = sentiment_category)) +
  geom_histogram(binwidth = 0.05, alpha = 0.5, position = "identity") +
  labs(title = "Sentiment Scores Distribution", x = "Sentiment Score", y = "Count") +
  theme_minimal()

# Proportion of positive, negative, and neutral articles (syuzhet)
ggplot(my_data_hbd, aes(x = sentiment_category, fill = sentiment_category)) +
  geom_bar() +
  geom_text(
    stat = "count",
    aes(label = after_stat(count)),
    position = position_stack(vjust = 0.5)
  ) +
  labs(
    title = "Sentiment Categories Proportions Based on Syuzhet Package",
    x = "Sentiment Category",
    y = "Number of Articles"
  ) +
  theme_minimal() +
  theme(
    panel.background = element_rect(fill = "grey90", color = NA),
    panel.grid.major = element_line(color = "white"),
    panel.grid.minor = element_blank()
  )

# Sentiment change over time based on the syuzhet scores
# Convert first_publication_date to a Date object
my_data_hbd$first_publication_date <- as.Date(my_data_hbd$first_publication_date)
# Mean sentiment score per publication date
sentiment_by_date <- my_data_hbd %>%
  group_by(first_publication_date) %>%
  summarise(sentiment_score = mean(sentiment_score))
# Line chart of the daily mean score
ggplot(sentiment_by_date, aes(x = first_publication_date, y = sentiment_score)) +
  geom_line(color = "#000080") +
  geom_hline(yintercept = 0, color = "red") +
  labs(title = "Sentiment Score over Time Based on Syuzhet Package",
       x = "Date", y = "Sentiment Score") +
  theme_minimal()

# Combine the Bing and syuzhet time series into one figure
library(patchwork)
library(scales)
# Bing lexicon time series
plota <- ggplot(sentiment_category_date,
                aes(x = first_publication_date, y = sentiment_score)) +
  geom_line(color = "#000080") +
  geom_hline(yintercept = 0, color = "red") +
  labs(title = "Sentiment Score Over Time Based on Bing Lexicon",
       x = "Date",
       y = "Sentiment Score") +
  theme_minimal() +
  scale_x_date(date_breaks = "2 months", date_labels = "%b %y")
# Syuzhet time series
plotb <- ggplot(sentiment_by_date,
                aes(x = first_publication_date, y = sentiment_score)) +
  geom_line(color = "#000080") +
  geom_hline(yintercept = 0, color = "red") +
  labs(title = "Sentiment Score over Time Based on Syuzhet Package",
       x = "Date", y = "Sentiment Score") +
  theme_minimal() +
  scale_x_date(date_breaks = "2 months", date_labels = "%b %y")
# Stack the two plots vertically (patchwork)
plota / plotb

## Supervised Machine Learning ----
# Build models to predict the (syuzhet-derived) sentiment category
# Encode the sentiment categories as integers and add a row id
my_data_ml <- my_data_hbd %>%
  mutate(
    id = row_number(), # row number doubles as a document id
    sentiment_category = case_when(
      sentiment_category == "Neutral" ~ 0,
      sentiment_category == "Negative" ~ 1,
      sentiment_category == "Positive" ~ 2,
      TRUE ~ NA_real_ # any unexpected value becomes NA
    )
  )
# Extract unigrams and bigrams as features
unigrams <- my_data_ml$body_text %>%
  tokens(remove_punct = TRUE) %>% # TRUE, not T (T is reassignable)
  tokens_ngrams(1) %>%
  dfm()
bigrams <- my_data_ml$body_text %>%
  tokens(remove_punct = TRUE) %>%
  tokens_ngrams(2) %>%
  dfm()
# Sparsity correction: drop features absent from >= 90% of documents
unigrams_corrected <- dfm_trim(unigrams, sparsity = .90)
bigrams_corrected <- dfm_trim(bigrams, sparsity = .90)
# Check dimensions
dim(unigrams_corrected)
#> [1] 3194 622
# (console output above commented out so the script sources cleanly)
# TF-IDF-weighted ngrams
unigrams_tfidf <- dfm_tfidf(unigrams_corrected,
                            scheme_tf = 'count',
                            scheme_df = 'inverse')
bigrams_tfidf <- dfm_tfidf(bigrams_corrected,
                           scheme_tf = 'count',
                           scheme_df = 'inverse')
# Convert ngrams to data frames
news_unigrams <- convert(unigrams_tfidf, 'data.frame')
news_bigrams <- convert(bigrams_tfidf, 'data.frame')
# Add id column so unigrams and bigrams can be joined later
news_unigrams$id <- my_data_ml$id
news_bigrams$id <- my_data_ml$id
# Remove doc_id column
unigrams_tfidf_df <- subset(news_unigrams, select = -c(doc_id))
bigrams_tfidf_df <- subset(news_bigrams, select = -c(doc_id))
# Join the two feature sets
ml_data <- bigrams_tfidf_df %>% left_join(unigrams_tfidf_df, by = "id")
# Add the syuzhet sentiment score as an extra feature
ml_data$sentiment_score <- get_sentiment(my_data_ml$body_text)
# Add label column (numeric codes 0/1/2 from my_data_ml)
ml_data$label <- my_data_ml$sentiment_category
# Remove the id column
ml_data <- ml_data %>% select(-id)
# Map the numeric codes back to readable class labels.
# label is numeric, so compare against numbers (the original compared
# against the strings "2"/"1", which only worked via implicit coercion)
ml_data$label <- ifelse(ml_data$label == 2, "Positive",
                        ifelse(ml_data$label == 1, "Negative", "Neutral"))
# Convert label to factor (required by caret classification)
ml_data$label <- as.factor(ml_data$label)
# Machine learning setup ----
library(caret)
set.seed(123)
# 80/20 stratified train/test split
for_training <- createDataPartition(y = ml_data$label,
                                    p = .8,
                                    list = FALSE)
training_data <- ml_data[for_training, ]
testing_data <- ml_data[-for_training, ]
# 5-fold cross-validation with class probabilities
# (TRUE, not T — T is a reassignable alias)
training_controls <- trainControl(method = "cv",
                                  number = 5,
                                  classProbs = TRUE)
## Train the SVM model (linear kernel)
model.svm <- train(label ~ .,
                   data = training_data,
                   method = "svmLinear",
                   trControl = training_controls,
                   preProcess = c('zv') # drop zero-variance predictors
)
# Predict on the testing data
pred_svm <- predict(model.svm, testing_data)
# Evaluate the model
confusionMatrix(pred_svm, as.factor(testing_data$label), mode = "prec_recall")
#> Confusion Matrix and Statistics  (console output commented so the script sources)
#>             Reference
#> Prediction   Negative Neutral Positive
#>   Negative        431       5       42
#>   Neutral           0       0        0
#>   Positive         21       1      137
#> Overall Statistics
#>   Accuracy : 0.8917
#>   95% CI : (0.8649, 0.9147)
#>   No Information Rate : 0.7096
#>   P-Value [Acc > NIR] : < 2.2e-16
#>   Kappa : 0.7274
#>   Mcnemar's Test P-Value : 0.004637
#> Statistics by Class:
#>                      Class: Negative Class: Neutral Class: Positive
#> Precision                     0.9017             NA          0.8616
#> Recall                        0.9535       0.000000          0.7654
#> F1                            0.9269             NA          0.8107
#> Prevalence                    0.7096       0.009419          0.2810
#> Detection Rate                0.6766       0.000000          0.2151
#> Detection Prevalence          0.7504       0.000000          0.2496
#> Balanced Accuracy             0.8497       0.500000          0.8587
## Train the Naive Bayes model
model.nb <- train(label ~ .,
                  data = training_data,
                  method = "naive_bayes",
                  trControl = training_controls,
                  preProcess = c('zv')) # drop zero-variance predictors
# Predict on the testing data
pred_nb <- predict(model.nb, testing_data)
# Evaluate the model
confusionMatrix(pred_nb, as.factor(testing_data$label), mode = "prec_recall")
#> Confusion Matrix and Statistics  (console output commented so the script sources)
#>             Reference
#> Prediction   Negative Neutral Positive
#>   Negative        451       6      174
#>   Neutral           1       0        5
#>   Positive          0       0        0
#> Overall Statistics
#>   Accuracy : 0.708
#>   95% CI : (0.671, 0.7431)
#>   No Information Rate : 0.7096
#>   P-Value [Acc > NIR] : 0.5544
#>   Kappa : 0.0169
#>   Mcnemar's Test P-Value : <2e-16
#> Statistics by Class:
#>                      Class: Negative Class: Neutral Class: Positive
#> Precision                     0.7147       0.000000              NA
#> Recall                        0.9978       0.000000           0.000
#> F1                            0.8329            NaN              NA
#> Prevalence                    0.7096       0.009419           0.281
#> Detection Rate                0.7080       0.000000           0.000
#> Detection Prevalence          0.9906       0.009419           0.000
#> Balanced Accuracy             0.5124       0.495246           0.500
# Set training control parameters (5-fold CV, no per-iteration logging)
training_control <- trainControl(method = "cv",
                                 number = 5,
                                 verboseIter = FALSE)
# Train the Neural Network (NN) model
model.nn <- train(label ~ .,
                  data = training_data,
                  trControl = training_control,
                  method = "nnet",
                  MaxNWts = 10000, # raise the weight cap: many tf-idf features
                  maxit = 10)      # cap optimisation at 10 iterations
#> nnet console trace (one section per CV fold x tuning combination), e.g.:
#>   # weights: 937
#>   initial value 3012.317783
#>   iter 10 value 215.740063
#>   final value 215.740063
#>   stopped after 10 iterations
#> Every fit stopped after 10 iterations because maxit = 10.
#> (The ~230-line pasted trace was transient console output that prevented
#>  the script from sourcing; it is summarised here as a comment.)
# Predict on the testing data
pred_nn <- predict(model.nn, testing_data)
# Evaluate the model (label is already a factor, no coercion needed)
confusionMatrix(pred_nn, testing_data$label, mode = "prec_recall")
#> Confusion Matrix and Statistics  (console output commented so the script sources)
#>             Reference
#> Prediction   Negative Neutral Positive
#>   Negative        447       1        2
#>   Neutral           0       0        0
#>   Positive          5       5      177
#> Overall Statistics
#>   Accuracy : 0.9796
#>   95% CI : (0.9654, 0.9891)
#>   No Information Rate : 0.7096
#>   P-Value [Acc > NIR] : < 2e-16
#>   Kappa : 0.951
#>   Mcnemar's Test P-Value : 0.06333
#> Statistics by Class:
#>                      Class: Negative Class: Neutral Class: Positive
#> Precision                     0.9933             NA          0.9465
#> Recall                        0.9889       0.000000          0.9888
#> F1                            0.9911             NA          0.9672
#> Prevalence                    0.7096       0.009419          0.2810
#> Detection Rate                0.7017       0.000000          0.2779
#> Detection Prevalence          0.7064       0.000000          0.2936
#> Balanced Accuracy             0.9864       0.500000          0.9835
## Unsupervised Machine Learning ----
# k-means clustering using the extracted n-gram features
ml_grams <- bigrams_tfidf_df %>% left_join(unigrams_tfidf_df, by = "id")
# Remove the id column
ml_grams <- ml_grams %>% select(-id)
# Determine k via the elbow method (total within-cluster sum of squares)
wss <- numeric(10) # preallocate instead of growing inside the loop
for (i in 1:10) {
  ngrams_kmeans_temp <- kmeans(x = ml_grams,
                               centers = i,
                               iter.max = 20,
                               nstart = 10)
  wss[i] <- ngrams_kmeans_temp$tot.withinss
}
{plot(wss, type = 'b')}

# Settle for k=2
ngrams_kmeans <- kmeans(ml_grams,
                        centers = 2,
                        iter.max = 10,
                        nstart = 10)
# Look at the clusters by their most prominent ngram centre weights
# Cluster 1
cat("Cluster 1: \n")
print(head(sort(ngrams_kmeans$centers[1, ], decreasing = TRUE), n = 20))
#> Console output (commented so the script sources): top centre weights were
#> royal (53.79), her (31.04), state (29.12), will (23.43), will_be (22.49),
#> prime (22.42), prime_minister (19.99), minister (19.88), she (18.29),
#> here (15.88), service (15.35), world (14.74), members_of (13.75), ...
# Cluster 2
cat("\nCluster 2: \n")
print(head(sort(ngrams_kmeans$centers[2, ], decreasing = TRUE), n = 20))
#> Console output (commented): top centre weights were
#> her (0.90), she (0.81), i (0.67), his (0.59), met (0.58), he (0.57),
#> the_met (0.55), women (0.53), you (0.48), officers (0.45), him (0.44),
#> court (0.43), people (0.43), we (0.42), family (0.42), ...
# k-means on the sentiment score alone
# Determine k via the elbow method
wss <- numeric(10) # preallocate instead of growing inside the loop
for (i in 1:10) {
  sentiment_kmeans_temp <- kmeans(x = ml_data$sentiment_score,
                                  centers = i,
                                  iter.max = 10,
                                  nstart = 10)
  wss[i] <- sentiment_kmeans_temp$tot.withinss
}
{plot(wss, type = 'b')}

# Settle for k=3
sentiment_kmeans <- kmeans(ml_data$sentiment_score,
                           centers = 3,
                           iter.max = 5,
                           nstart = 5)
# Print the centre of each sentiment cluster
print(sentiment_kmeans$centers[1, ])
#> 1.58212    (console output commented so the script sources cleanly)
print(sentiment_kmeans$centers[2, ])
#> -10.96622
print(sentiment_kmeans$centers[3, ])
#> 99.87857
# Visualise the clusters
# Add the cluster labels to the feature data frame
ml_data$cluster <- factor(sentiment_kmeans$cluster)
# Boxplot of sentiment score by cluster
ggplot(ml_data, aes(x = cluster, y = sentiment_score)) +
  geom_boxplot() +
  xlab("Cluster") +
  ylab("Sentiment Score")

### Topic Modelling ----
# Load required packages
library(tidyr)
library(quanteda)
library(caret)
library(dplyr)
library(tidytext)
library(topicmodels)
library(ldatuning)
library(stm)
library(wordcloud)
library(tm)
library(ggplot2)
# Prepare the data and select required columns
my_data_hbd <- my_data %>%
  select(headline, body_text, first_publication_date)
# Remove duplicates
my_data_hbd <- distinct(my_data_hbd)
# Remove any noise in the data (collapse non-word characters to spaces)
my_data_hbd$body_text <- gsub("\\W+", " ", my_data_hbd$body_text)
# Tokenise; remove punctuation, numbers, symbols, and stopwords;
# lowercase and stem; then drop stemmed domain words that are
# uninformative for topic modelling in a police-news corpus
tm_tokens <- tokens(my_data_hbd$body_text,
                    remove_punct = TRUE,
                    remove_numbers = TRUE,
                    remove_symbols = TRUE) %>%
  tokens_remove(stopwords("english")) %>%
  tokens_tolower() %>%
  tokens_wordstem(language = quanteda_options("language_stemmer")) %>%
  tokens_remove(c("polic", "offic", "peopl", "year", "day", "will",
                  "also", "t", "s", "can", "uk", "british"))
# Create a Document Feature Matrix (dfm)
dfm_tm <- dfm(tm_tokens)
# Trim terms appearing in < 7.5% or > 90% of documents
dfm.trim <- dfm_trim(dfm_tm, min_docfreq = 0.075, max_docfreq = 0.9,
                     docfreq_type = "prop")
dfm.trim
#> Document-feature matrix of: 3,194 documents, 764 features (83.44% sparse)
#> (feature preview console output commented out so the script sources cleanly)
## LDA topic modelling ----
n.topics <- 5
dfm2topicmodels <- convert(dfm.trim, to = "topicmodels")
lda.model <- LDA(dfm2topicmodels, n.topics, control = list(seed = 111))
lda.model
#> A LDA_VEM topic model with 5 topics.
#> (console output commented out so the script sources cleanly)
# Extract the topic-term matrix (beta) from the LDA model
beta_topics <- tidy(lda.model, matrix = "beta")
beta_topics
# Top 10 terms per topic, ranked by beta
beta_top_terms <- beta_topics %>%
  group_by(topic) %>%
  slice_max(beta, n = 10) %>%
  ungroup() %>%
  arrange(topic, -beta)
# Display the grouped terms
beta_top_terms %>%
  mutate(term = reorder_within(term, beta, topic)) %>%
  ggplot(aes(beta, term, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  scale_y_reordered()

# Top 10 terms for each topic
as.data.frame(terms(lda.model, 10))
# Topic number assigned to each news article in the corpus
data.frame(Topic = topics(lda.model))
# Keep the per-document topic assignments in a named data frame
topics_df <- data.frame(Topic = topics(lda.model))
# Plot the count of topics.
# after_stat(count) replaces the deprecated ..count.. notation,
# consistent with its use elsewhere in this script.
ggplot(topics_df, aes(x = factor(Topic))) +
  geom_bar(aes(fill = factor(Topic)), alpha = 0.8) +
  scale_fill_discrete(name = "Topic") +
  labs(title = "Topic Distribution Based on Latent Dirichlet Allocation (LDA)",
       x = "Topic", y = "Count") +
  theme(plot.title = element_text(hjust = 0.5),
        axis.text.x = element_text(angle = 45, hjust = 1),
        axis.title = element_text(size = 12),
        panel.background = element_rect(fill = "white"),
        panel.grid.major = element_line(colour = "gray90")) +
  geom_text(stat = 'count', aes(label = after_stat(count)), vjust = -0.5)

## Structural Topic Modeling (STM) ----
n.topics <- 5
dfm2stm <- convert(dfm.trim, to = "stm")
modell.stm <- stm(dfm2stm$documents, dfm2stm$vocab, K = n.topics,
                  data = dfm2stm$meta, init.type = "Spectral")
#> stm console trace (spectral initialisation, then EM iterations 1-18;
#> model converged at iteration 18). Topic snapshot at iteration 15:
#>   Topic 1: court, investig, case, alleg, charg
#>   Topic 2: incid, investig, fire, area, arrest
#>   Topic 3: met, govern, home, forc, protest
#>   Topic 4: famili, told, old, murder, say
#>   Topic 5: royal, state, minist, time, servic
#> (The ~100-line pasted trace was transient console output that prevented
#>  the script from sourcing; it is summarised here as a comment.)
# Top 10 terms for each topic (probability-ranked), as a data frame
as.data.frame(t(labelTopics(modell.stm, n = 10)$prob))
# Word cloud for the first topic
par(mar = c(0.5, 0.5, 0.5, 0.5))
cloud(modell.stm, topic = 1, scale = c(2.25, .5))

# Summary of the estimated topic shares for the corpus
plot(modell.stm, type = "summary", text.cex = 0.5,
     main = "Topic shares on the corpus as a whole",
     xlab = "estimated share of topics")

# Top 10 terms for each of the 5 topics, under four weighting schemes
labelTopics(modell.stm, topics = c(1:5), n = 10)
#> Console output (commented so the script sources cleanly):
#> Topic 1 Top Words:
#>  Highest Prob: court, investig, case, alleg, charg, evid, told, inquiri, victim, sexual
#>  FREX: alleg, court, sexual, prosecut, charg, evid, rape, case, sentenc, offenc
#>  Lift: rape, prosecutor, prosecut, sexual, judg, alleg, convict, jail, sentenc, court
#>  Score: rape, court, sexual, investig, alleg, prosecut, guilti, juri, sentenc, convict
#> Topic 2 Top Words:
#>  Highest Prob: incid, investig, arrest, area, man, fire, london, servic, report, two
#>  FREX: incid, fire, area, road, scene, arrest, hospit, manchest, west, search
#>  Lift: fire, condit, vehicl, incid, road, suspicion, locat, scene, area, injur
#>  Score: fire, incid, investig, road, area, hospit, man, search, scene, resid
#> Topic 3 Top Words:
#>  Highest Prob: met, govern, home, forc, protest, women, public, report, say, crime
#>  FREX: protest, commission, govern, secretari, organis, chief, violenc, labour, right, crime
#>  Lift: protest, patel, commission, anti, mayor, tackl, institut, improv, polici, cultur
#>  Score: protest, govern, commission, secretari, crime, patel, everard, met, minist, women
#> Topic 4 Top Words:
#>  Highest Prob: famili, told, old, say, murder, one, go, friend, time, just
#>  FREX: friend, mother, boy, kill, love, know, son, school, dog, girl
#>  Lift: dog, daughter, teenag, parent, didn, friend, sister, boy, son, father
#>  Score: dog, juri, mother, murder, famili, boy, stab, friend, son, love
#> Topic 5 Top Words:
#>  Highest Prob: royal, state, minist, time, servic, famili, new, one, first, countri
#>  FREX: royal, state, ireland, northern, minist, prime, countri, pay, respect, world
#>  Lift: ireland, royal, northern, prime, pay, respect, state, world, visit, great
#>  Score: ireland, royal, minist, northern, prime, tribut, state, leader, parliament, countri
# Histograms of estimated topic shares for 3 randomly selected topics
plot(modell.stm, type = "hist", topics = sample(1:n.topics, size = 3),
     main = "histogram of the topic shares within the documents")

# Top terms for each of the 5 topics
plot(modell.stm, type = "labels", topics = c(1, 2, 3, 4, 5), main = "Topic terms")

library(ggplot2)
library(dplyr)
# Most probable topic per document (row-wise argmax over theta)
top_topics <- apply(modell.stm$theta, 1, which.max)
# Attach the STM topic labels to the data frame
my_data_hbd$Topic_stm <- top_topics
# Count documents per topic
df_topic_counts <- my_data_hbd %>%
  group_by(Topic_stm) %>%
  summarize(count = n())
# Bar chart of the STM topic distribution
ggplot(df_topic_counts,
       aes(x = factor(Topic_stm), y = count, fill = factor(Topic_stm))) +
  geom_bar(stat = "identity", alpha = 0.8) +
  scale_fill_discrete(name = "Topic") +
  labs(title = "Topic Distribution Based on Structural Topic Modeling (STM)",
       x = "Topic", y = "Count") +
  theme(plot.title = element_text(hjust = 0.5),
        axis.text.x = element_text(angle = 45, hjust = 1),
        axis.title = element_text(size = 12),
        panel.background = element_rect(fill = "white"),
        panel.grid.major = element_line(colour = "gray90")) +
  geom_text(aes(label = count), vjust = -0.5)

## To see how the topics changed over time (LDA model)
# Add the topic labels to the original data frame
my_data_hbd$Topic_lda <- topics(lda.model)
# Convert the date column to a Date object
my_data_hbd$first_publication_date <- as.Date(my_data_hbd$first_publication_date, format = "%Y-%m-%d")
# Calculate the proportion of each topic by year
# (lubridate is accessed via :: and so needs no library() call)
# NOTE(review): `.groups = 'drop'` already returns an ungrouped frame, so the
# following ungroup() is a harmless no-op.
topic_proportions <- my_data_hbd %>%
group_by(Topic_lda, year = lubridate::year(first_publication_date)) %>%
summarise(count = n(), .groups = 'drop') %>%
ungroup() %>%
group_by(year) %>%
mutate(proportion = count / sum(count))
# Plot the topic proportions over time
ggplot(topic_proportions, aes(x = year, y = proportion, color = factor(Topic_lda))) +
geom_line() +
labs(x = "Year", y = "Proportion", color = "Topic") +
theme_bw()

# Calculate the proportion of each topic by half-year
# NOTE(review): grouping by `year` alongside `half_year` is redundant — the
# 6-month floor date already determines the year — but it does not change the result.
topic_proportions_half_year <- my_data_hbd %>%
group_by(Topic_lda, year = lubridate::year(first_publication_date), half_year = lubridate::floor_date(first_publication_date, unit = "6 months")) %>%
summarise(count = n(), .groups = 'drop') %>%
ungroup() %>%
group_by(year, half_year) %>%
mutate(proportion = count / sum(count))
# Plot the topic proportions over time
ggplot(topic_proportions_half_year, aes(x = half_year, y = proportion, color = factor(Topic_lda))) +
geom_line() +
labs(x = "Half-Year", y = "Proportion", color = "Topic") +
theme_bw()

## To see how the topics changed over time (STM model)
# Identify the most probable topic for each document
top_topics <- apply(modell.stm$theta, 1, which.max)
# Add the topic labels to the original data frame
my_data_hbd$Topic_stm <- top_topics
# Calculate the proportion of each topic by year
topic_proportions_stm <- my_data_hbd %>%
group_by(Topic_stm, year = lubridate::year(first_publication_date)) %>%
summarise(count = n(), .groups = 'drop') %>%
ungroup() %>%
group_by(year) %>%
mutate(proportion = count / sum(count))
# Plot the topic proportions over time
ggplot(topic_proportions_stm, aes(x = year, y = proportion, color = factor(Topic_stm))) +
geom_line() +
labs(x = "Year", y = "Proportion", color = "Topic") +
theme_bw()

# Calculate the proportion of each topic by half-year
topic_proportions_stm_half_year <- my_data_hbd %>%
group_by(Topic_stm, year = lubridate::year(first_publication_date), half_year = lubridate::floor_date(first_publication_date, unit = "6 months")) %>%
summarise(count = n(), .groups = 'drop') %>%
ungroup() %>%
group_by(year, half_year) %>%
mutate(proportion = count / sum(count))
# Plot the topic proportions over time
ggplot(topic_proportions_stm_half_year, aes(x = half_year, y = proportion, color = factor(Topic_stm))) +
geom_line() +
labs(x = "Half-Year", y = "Proportion", color = "Topic") +
theme_bw()

# Refine the graphs, add the top 5 terms for each topic to the legend (LDA model)
# Extract the top 5 terms for each topic
# (terms() returns a matrix with one column per topic)
top_terms <- terms(lda.model, 5)
# Create label for color legend
# NOTE(review): the notebook version below replaces this hand-written
# five-branch paste(sep = "") with a paste0() loop; kept verbatim here.
legend_label <- paste("Top 5 Terms\n",
"1: ", paste(top_terms[,1], collapse = ", "), "\n",
"2: ", paste(top_terms[,2], collapse = ", "), "\n",
"3: ", paste(top_terms[,3], collapse = ", "), "\n",
"4: ", paste(top_terms[,4], collapse = ", "), "\n",
"5: ", paste(top_terms[,5], collapse = ", "), sep = "")
# Plot the topic proportions over time
ggplot(topic_proportions_half_year, aes(x = half_year, y = proportion, color = factor(Topic_lda))) +
geom_line() +
labs(x = "Half-Year", y = "Proportion",
color = legend_label,
subtitle = "Topic Changes Over Time Using LDA Topic Modelling") +
theme_bw()

# Refine the graphs, add the top 5 terms for each topic to the legend (STM model)
# Extract the top 5 terms for each topic
top_terms_stm <- labelTopics(modell.stm, topics = c(1:5), n=5)
top_terms_stm$frex
# NOTE(review): the next six lines are pasted console output (the printed frex
# matrix, one row per topic), not executable R code.
[,1] [,2] [,3] [,4] [,5]
[1,] "alleg" "court" "sexual" "prosecut" "charg"
[2,] "incid" "fire" "area" "road" "scene"
[3,] "protest" "commission" "govern" "secretari" "organis"
[4,] "friend" "mother" "boy" "kill" "love"
[5,] "royal" "state" "ireland" "northern" "minist"
# Create label for color legend
# (frex is topics x terms, so rows are indexed here, unlike terms() above)
legend_label_stm <- paste("Top 5 Terms\n",
"1: ", paste(top_terms_stm$frex[1,], collapse = ", "), "\n",
"2: ", paste(top_terms_stm$frex[2,], collapse = ", "), "\n",
"3: ", paste(top_terms_stm$frex[3,], collapse = ", "), "\n",
"4: ", paste(top_terms_stm$frex[4,], collapse = ", "), "\n",
"5: ", paste(top_terms_stm$frex[5,], collapse = ", "), sep = "")
# Plot the topic proportions over time
ggplot(topic_proportions_stm_half_year, aes(x = half_year, y = proportion, color = factor(Topic_stm))) +
geom_line() +
labs(x = "Half-Year", y = "Proportion",
color = legend_label_stm,
subtitle = "Topic Changes Over Time Using Structural Topic Modeling (STM)") +
theme_bw()

# Try to combine sentiment and topic changes over time
library(patchwork)
# Create a line chart of sentiment score over time
p1 <- ggplot(sentiment_by_date, aes(x = first_publication_date, y = sentiment_score)) +
geom_line() +
geom_hline(yintercept = 0, linetype = "dashed", color = "blue") +
labs(title = "Sentiment Score over Time Based on Syuzhet Package", x = "Date", y = "Sentiment Score") +
theme_minimal()+
theme(plot.title = element_text(size = 11))
# Plot the topic proportions over time using LDA
p2 <- ggplot(topic_proportions_half_year, aes(x = half_year, y = proportion, color = factor(Topic_lda))) +
geom_line() +
labs(x = "Half-Year", y = "Proportion",
color = "Topic", subtitle = "Topic Changes Over Time Using LDA Topic Modelling") +
theme_bw()
# Plot the topic proportions over time using STM
p3 <- ggplot(topic_proportions_stm_half_year, aes(x = half_year, y = proportion, color = factor(Topic_stm))) +
geom_line() +
labs(x = "Half-Year", y = "Proportion",
color = "Topic",
subtitle = "Topic Changes Over Time Using Structural Topic Modeling") +
theme_bw()
# Combine the three plots vertically
p1 + p2 + p3 + plot_layout(ncol = 1)

---
title: "R Notebook"
output: html_notebook
---
```{r}
# Connect to Guardian API
# gu_api_key() prompts interactively for the API key; nothing is hard-coded here.
library(guardianapi)
gu_api_key()
```

```{r}
# Get data include key words "police" OR "law enforcement"
# The inner double quotes ask the API for exact-phrase matches of either term.
data_all <-gu_content(query='"police" OR "law enforcement"',
        from_date="2021-01-01",
        to_date="2023-03-31")
```
```{r}
# Check the data
dim(data_all)
head(data_all)
# 24863 news articles collected
```

```{r}
# Limit the data to only section_name == 'UK news'
data_uk_sec <- subset(data_all, section_name == 'UK news')
dim(data_uk_sec)
head(data_uk_sec) 
```


```{r}
# Persist the UK-news subset to disk so later runs can skip the API call.
library(dplyr)
# The `tags` column holds nested lists, which write.csv cannot serialise,
# so drop it before export.
my_data <- select(data_uk_sec, -tags)
my_df <- as.data.frame(my_data)
write.csv(my_df, "raw_data.csv", row.names = TRUE)
```

```{r}
# Read collected data
# Re-reading from disk makes the notebook re-runnable without hitting the API.
my_data <- read.csv('raw_data.csv')
dim(my_data)# 3259 UK news articles 
```

```{r}
# Load required packages
library(tidyr)
library(quanteda)
library(caret)
library(dplyr)
library(tidytext)
library(syuzhet)
library(tidyverse)
library(quanteda.textstats)
library(ggplot2)
```

```{r}
# Pre-processing
# Select required columns
my_data_hbd <- my_data %>% 
  select(headline, body_text, first_publication_date)
# Remove duplicates
my_data_hbd   <- distinct(my_data_hbd)
# Remove noise
# (collapse every run of non-word characters into a single space before tokenising)
my_data_hbd$body_text <- gsub("\\W+", " ", my_data_hbd$body_text)
# Tokenise the data and remove punctuation, numbers, and symbols
# Convert all text to lowercase and apply stemming
# NOTE(review): my_data_tokens does not appear to be used again in this
# notebook — the sentiment analysis below re-tokenises body_text via tidytext.
my_data_tokens <- tokens(my_data_hbd$body_text, remove_punct = TRUE, remove_numbers = TRUE, remove_symbols = TRUE) %>% tokens_remove(stopwords("english")) %>% tokens_tolower() %>% tokens_wordstem(language = quanteda_options("language_stemmer")) 
```

```{r}
dim(my_data_hbd)# 3194 UK news articles after pre-processing
head(my_data_hbd)
```


```{r}
## Bing lexicon
# Try to use "bing" lexicon to get sentiment score for each news article
# Convert the data to a tidy format
# NOTE(review): naming this data frame `tokens` does not break later calls to
# quanteda::tokens() — R skips non-function objects when resolving a call.
tokens <- my_data_hbd %>%
  unnest_tokens(word, body_text) %>%
  anti_join(stop_words)
# Load the lexicon
lexicon <- get_sentiments("bing")

# Join the lexicon with data
# (inner join keeps only words present in the bing lexicon)
my_data_sentiment <- tokens %>%
  inner_join(lexicon)

# Calculate the sentiment score
# NOTE(review): assumes both "positive" and "negative" occur somewhere in the
# corpus; if one class were entirely absent its column would not exist and the
# mutate below would error. Holds for this dataset.
my_data_sentiment_scores <- my_data_sentiment %>%
  count(headline, sentiment) %>%
  pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>%
  mutate(sentiment_score = positive - negative)

# Visualise the sentiment scores
ggplot(my_data_sentiment_scores, aes(x = sentiment_score)) +
  geom_histogram(binwidth = 2, color = "black", fill = "lightblue") +
  labs(title = "Distribution of Sentiment Scores",
       x = "Sentiment Score",
       y = "Frequency") +
  theme_minimal()

# Categorise sentiment scores as positive, negative, or neutral
my_data_sentiment_category <- my_data_sentiment_scores %>%
  mutate(sentiment_category = ifelse(sentiment_score > 0, "Positive",
                                     ifelse(sentiment_score < 0, "Negative", "Neutral")))
  
# Count the number of articles in each sentiment category
my_data_sentiment_counts <- my_data_sentiment_category %>%
  count(sentiment_category)
print(my_data_sentiment_counts)
```

```{r}
# Visualise the distribution of sentiment categories in articles
ggplot(data = my_data_sentiment_counts, aes(x = sentiment_category, y = n, fill = sentiment_category)) +
  geom_col() +
  geom_text(aes(label = n), position = position_stack(vjust = 0.5), color = "black", size = 4) +
  labs(x = "Sentiment Category", y = "Number of Articles", fill = "Sentiment Category") +
  ggtitle("Sentiment Categories Proportions Based on Bing Lexicon")

```

```{r}
# Try to explore sentiment change over time based on Bing lexicon
# Calculate the sentiment score by date
# (net score per day: positive minus negative word counts across all articles)
sentiment_scores_date <- my_data_sentiment %>%
  count(first_publication_date, sentiment) %>%
  pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>%
  mutate(sentiment_score = positive - negative)

# Categorise sentiment scores as positive, negative, or neutral
sentiment_category_date <- sentiment_scores_date %>%
  mutate(sentiment_category = ifelse(sentiment_score > 0, "Positive",
                                     ifelse(sentiment_score < 0, "Negative", "Neutral")))

# Convert date to date object
# (needed so ggplot treats the x axis as a time scale)
sentiment_category_date$first_publication_date <- as.Date(sentiment_category_date$first_publication_date)

# Visualise the sentiment scores over time
ggplot(sentiment_category_date, aes(x = first_publication_date, y = sentiment_score)) +
  geom_line(color = "#000080") +
  geom_hline(yintercept = 0, color = "red") +
  labs(title = "Sentiment Score Over Time Based on Bing Lexicon",
       x = "Date",
       y = "Sentiment Score") +
  theme_minimal()
```


```{r}
## Use get_sentiment() from syuzhet package to get sentiment score for each news article
library(syuzhet)
# Calculate the sentiment scores
# (uses get_sentiment()'s default lexicon/method; scores are continuous reals)
my_data_hbd$sentiment_score <- get_sentiment(my_data_hbd$body_text)

# Categorise sentiment scores as positive, negative, or neutral
# Bins: (-Inf, -0.1] = Negative, (-0.1, 0.1] = Neutral, (0.1, Inf) = Positive
my_data_hbd$sentiment_category <- cut(my_data_hbd$sentiment_score, breaks=c(-Inf,-0.1,0.1,Inf), labels=c("Negative", "Neutral", "Positive"))

# Visualise the distribution of sentiment scores
ggplot(my_data_hbd, aes(x=sentiment_score, fill=sentiment_category)) +
  geom_histogram(binwidth=0.05, alpha=0.5, position="identity") +
  labs(title="Sentiment Scores Distribution", x="Sentiment Score", y="Count") +
  theme_minimal()
```

```{r}
# Visualise the proportion of positive, negative, and neutral sentiment categories
ggplot(my_data_hbd, aes(x = sentiment_category, fill = sentiment_category)) +
  geom_bar() +
  geom_text(stat = "count", aes(label = after_stat(count)), 
            position = position_stack(vjust = 0.5)) +
  labs(title = "Sentiment Categories Proportions Based on Syuzhet Package",
       x = "Sentiment Category", y = "Number of Articles") +
  theme_minimal() +
  theme(panel.background = element_rect(fill = "grey90", color = NA),
        panel.grid.major = element_line(color = "white"),
        panel.grid.minor = element_blank())

```


```{r}
# Try to explore sentiment change over time based on syuzhet package
# Convert first_publication_date to a date object
my_data_hbd$first_publication_date <- as.Date(my_data_hbd$first_publication_date)

# Aggregate sentiment score by date
# (mean article score per publication day)
sentiment_by_date <- my_data_hbd %>%
  group_by(first_publication_date) %>%
  summarise(sentiment_score = mean(sentiment_score))

# Create a line chart of sentiment score over time
ggplot(sentiment_by_date, aes(x=first_publication_date, y=sentiment_score)) +
  geom_line(color = "#000080") +
  geom_hline(yintercept = 0, color = "red") +
  labs(title="Sentiment Score over Time Based on Syuzhet Package", x="Date", y="Sentiment Score") +
  theme_minimal()
```


```{r}
# Plot the sentiment score over time together (Bing Lexicon and Syuzhet Package)
library(patchwork)
library(scales)

# Create the first plot
# NOTE: bing plots daily NET WORD COUNTS while syuzhet plots daily MEAN article
# scores, so the two y axes are not on the same scale.
plota <- ggplot(sentiment_category_date, aes(x = first_publication_date, y = sentiment_score)) +
  geom_line(color = "#000080") +
  geom_hline(yintercept = 0, color = "red") +
  labs(title = "Sentiment Score Over Time Based on Bing Lexicon",
       x = "Date",
       y = "Sentiment Score") +
  theme_minimal() +
  scale_x_date(date_breaks = "2 months", date_labels = "%b %y")

# Create the second plot
plotb <- ggplot(sentiment_by_date, aes(x=first_publication_date, y=sentiment_score)) +
  geom_line(color = "#000080") +
  geom_hline(yintercept = 0, color = "red") +
  labs(title="Sentiment Score over Time Based on Syuzhet Package", x="Date", y="Sentiment Score") +
  theme_minimal() +
  scale_x_date(date_breaks = "2 months", date_labels = "%b %y")

# Combine the plots vertically using patchwork
plota / plotb
```


```{r}
## Supervised Machine Learning
# Try to apply machine learning technique, build a model to predict sentiment
# Map the values of the original column "sentiment_category" to integers
# (sentiment_category is a factor from cut(); comparing it to the label strings
# works because factor == character compares the printed levels)
my_data_ml <- my_data_hbd %>%
  mutate(id = row_number(), # add a new column with row numbers
         sentiment_category = case_when(
           sentiment_category == "Positive" ~ 2,
           sentiment_category == "Negative" ~ 1,
           sentiment_category == "Neutral" ~ 0,
           TRUE ~ NA_real_ # to handle any other cases
         )) 
```

```{r}
# Extract unigrams and bigrams as TF-IDF features for the supervised models.
# NOTE(review): attaching my_data_ml$id below assumes tokens()/dfm() keep one
# row per input document in input order — confirm against quanteda docs.
unigrams <- my_data_ml$body_text %>%
          tokens(remove_punct = TRUE) %>%
          tokens_ngrams(1) %>%
          dfm()
bigrams <- my_data_ml$body_text %>%
          tokens(remove_punct = TRUE) %>%
          tokens_ngrams(2) %>%
          dfm()
# Apply sparsity correction to reduce the zero counts
# (drop features absent from more than 90% of documents)
unigrams_corrected <- dfm_trim(unigrams, sparsity = 0.90)
bigrams_corrected <- dfm_trim(bigrams, sparsity = 0.90)

# Check dimensions
dim(unigrams_corrected)

# TFIDF-weighted ngrams
unigrams_tfidf <- dfm_tfidf(unigrams_corrected,
                           scheme_tf = 'count',
                           scheme_df = 'inverse')
bigrams_tfidf <- dfm_tfidf(bigrams_corrected,
                          scheme_tf = 'count',
                          scheme_df = 'inverse')

# Convert ngrams to data frames
news_unigrams <- convert(unigrams_tfidf, 'data.frame')
news_bigrams <- convert(bigrams_tfidf, 'data.frame')

# Add id column to ngrams to join unigrams and bigrams later
news_unigrams$id <- my_data_ml$id
news_bigrams$id <- my_data_ml$id

# Remove the quanteda doc_id column before joining
unigrams_tfidf_df <- subset(news_unigrams, select = -c(doc_id))
bigrams_tfidf_df <- subset(news_bigrams, select = -c(doc_id))

# Join the two feature sets on the document id
ml_data <- bigrams_tfidf_df %>% left_join(unigrams_tfidf_df, by = "id")
```

```{r}
# Get sentiment scores as features
# NOTE(review): the label below was DERIVED from this same sentiment_score via
# cut() thresholds, so including the score as a feature leaks the target —
# classifier accuracy will be inflated. Flagged for discussion, left as-is.
ml_data$sentiment_score <- get_sentiment(my_data_ml$body_text)
# Add label column
ml_data$label <- my_data_ml$sentiment_category
# Remove the id column
ml_data <- ml_data %>% select(-id)
# Map labels
ml_data$label <- ifelse(ml_data$label == "2", "Positive", 
                   ifelse(ml_data$label == "1", "Negative", "Neutral")) 
# Convert label to factor
ml_data$label <- as.factor(ml_data$label) # convert label to factor
```

```{r}
# Apply machine learning setting
library(caret)
set.seed(123)
# Stratified 80/20 split on the label
for_training <- createDataPartition(y = ml_data$label,
                                  p = .8,
                                  list = FALSE)

training_data <- ml_data[for_training,]
testing_data <- ml_data[-for_training,]
```


```{r}
# Set training control parameters
# (5-fold cross-validation; classProbs needed for probability output)
training_controls <- trainControl(method="cv", 
                                 number = 5,
                                 classProbs = T)
```

```{r}
## Train the SVM model
# preProcess = 'zv' drops zero-variance features before fitting
model.svm <- train(label ~ .,
                  data = training_data,
                  method = "svmLinear",
                  trControl = training_controls,
                  preProcess = c('zv')
                 )
```


```{r}
# Predict on the testing data
pred_svm <- predict(model.svm, testing_data)

# Evaluate the model
confusionMatrix(pred_svm, as.factor(testing_data$label), mode="prec_recall")
```


```{r}
## Train the Naive Bayes model
model.nb <- train(label ~ .,
                  data = training_data,
                  method = "naive_bayes",
                  trControl = training_controls,
                  preProcess = c('zv'))
```


```{r}
# Predict on the testing data
pred_nb <- predict(model.nb, testing_data)

# Evaluate the model
confusionMatrix(pred_nb, as.factor(testing_data$label), mode="prec_recall")
```

```{r}
# Set training control parameters
training_control <- trainControl(method = "cv", 
                                 number = 5, 
                                 verboseIter = FALSE)
```

```{r}
# Train the Neural Network (NN) model
# NOTE(review): maxit = 10 is very few iterations for nnet — likely
# under-trained; MaxNWts raises the weight-count cap for the wide feature set.
model.nn <- train(label ~ .,
                  data = training_data,
                  trControl = training_control,
                  method = "nnet",
                  MaxNWts = 10000,
                  maxit = 10)
```

```{r}
# Predict on the testing data
pred_nn <- predict(model.nn, testing_data)
# Evaluate the model
confusionMatrix(pred_nn, testing_data$label, mode = "prec_recall")
```


```{r}
## Unsupervised Machine Learning
# Try to apply k-means clustering model using the extracted features
# Use ngrams frequencies as features
ml_grams <- bigrams_tfidf_df %>% left_join(unigrams_tfidf_df, by = "id")
# Remove the id column
ml_grams <- ml_grams %>% select(-id)
```

```{r}
# Use the k-means model on n-grams
# Determine k via the elbow method: total within-cluster sum of squares for k = 1..10
# NOTE(review): kmeans is not seeded here, so the elbow curve can vary slightly
# between runs despite nstart = 10.
wss <- numeric()
for(i in 1:10){
  ngrams_kmeans_temp <- kmeans(x = ml_grams,
                             centers = i,
                             iter.max = 20,
                             nstart = 10)
  wss[i] <- ngrams_kmeans_temp$tot.withinss
}
{plot(wss, type='b')} 

```

```{r}
# Settle for k=2
ngrams_kmeans <- kmeans(ml_grams,
                      centers = 2,
                      iter.max = 10,
                      nstart = 10)
```

```{r}
# Look at the clusters by their most prominent ngram differences
# (top-20 centre coordinates = the features with the highest mean TF-IDF per cluster)
# Cluster 1
cat("Cluster 1: \n")
print(head(sort(ngrams_kmeans$centers[1, ], decreasing = TRUE), n = 20))
# Cluster 2
cat("\nCluster 2: \n")
print(head(sort(ngrams_kmeans$centers[2, ], decreasing = TRUE), n = 20))
```

```{r}
# Use the k-means model on sentiment score
# Determine k
wss <- numeric()
for(i in 1:10){
  sentiment_kmeans_temp <- kmeans(x = ml_data$sentiment_score,
                             centers = i,
                             iter.max = 10,
                             nstart = 10)
  wss[i] <- sentiment_kmeans_temp$tot.withinss
}
{plot(wss, type='b')}
```


```{r}
# Settle for k=3
sentiment_kmeans <- kmeans(ml_data$sentiment_score,
                      centers = 3,
                      iter.max = 5,
                      nstart = 5)
```


```{r}
# Report the centre of each of the three sentiment-score clusters.
for (k in 1:3) {
  print(sentiment_kmeans$centers[k, ])
}
```

```{r}
# Visualise the clusters
# Add the cluster labels to the original data frame
# (kmeans$cluster is one assignment per row of ml_data, in row order)
ml_data$cluster <- factor(sentiment_kmeans$cluster)

# Create the boxplot: sentiment score distribution within each cluster
ggplot(ml_data, aes(x = cluster, y = sentiment_score)) +
  geom_boxplot() +
  xlab("Cluster") +
  ylab("Sentiment Score")

```


```{r}
### Topic Modelling
# Load required packages
library(tidyr)
library(quanteda)
library(caret)
library(dplyr)
library(tidytext)
library(topicmodels)
library(ldatuning)
library(stm)
library(wordcloud)
library(tm)
library(ggplot2)
```

```{r}
# Prepare the data and select required columns
# NOTE(review): this REBUILDS my_data_hbd from my_data, discarding the
# sentiment columns added earlier — so chunk execution order matters.
my_data_hbd <- my_data %>% 
  select(headline, body_text, first_publication_date)
# Remove duplicates
my_data_hbd <- distinct(my_data_hbd)
# Remove any noise in the data
my_data_hbd$body_text <- gsub("\\W+", " ", my_data_hbd$body_text)
```


```{r}
# Tokenise the data and remove punctuation, numbers, and symbols
# Convert all text to lowercase, apply stemming, and remove non-informative words for topic modelling purposes
# (the final tokens_remove drops corpus-dominant stems like "polic" that would
# otherwise top every topic)
tm_tokens <- tokens(my_data_hbd$body_text, remove_punct = TRUE, remove_numbers = TRUE, remove_symbols = TRUE) %>% tokens_remove(stopwords("english")) %>% tokens_tolower() %>% tokens_wordstem(language = quanteda_options("language_stemmer")) %>% tokens_remove(c("polic", "offic", "peopl", "year", "day", "will", "also", "t", "s", "can", "uk", "british"))
```

```{r}
# Create a Document Feature Matrix (dfm)
dfm_tm <- dfm(tm_tokens)

# Trim the dfm for frequently and rarely occurring terms
# (docfreq_type = "prop": keep terms appearing in 7.5%-90% of documents)
dfm.trim <- dfm_trim(dfm_tm, min_docfreq = 0.075, max_docfreq = 0.9, docfreq_type = "prop")
dfm.trim
```

```{r}
## LDA topic modelling
# Fixed seed makes the LDA fit reproducible
n.topics <- 5
dfm2topicmodels <- convert(dfm.trim, to = "topicmodels")
lda.model <- LDA(dfm2topicmodels, n.topics, control = list(seed=111))
lda.model
```

```{r}
# Extract the topic-term matrix (beta) from the LDA model
# (tidy() gives one row per topic-term pair with its probability)
beta_topics <- tidy(lda.model, matrix = "beta")
beta_topics
```

```{r}
# Group the terms by topics
# (top 10 terms per topic by beta; slice_max may return >10 rows on ties)
beta_top_terms <- beta_topics %>%
  group_by(topic) %>%
  slice_max(beta, n = 10) %>%
  ungroup() %>%
  arrange(topic, -beta)
```

```{r}
# Display the group terms 
beta_top_terms %>%
  mutate(term = reorder_within(term, beta, topic)) %>%
  ggplot(aes(beta, term, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  scale_y_reordered()
```

```{r}
# Top 10 terms for each topics 
as.data.frame(terms(lda.model, 10))

# List the topic number for each news article in the corpus.
data.frame(Topic = topics(lda.model))
```

```{r}
# Collapse the fitted LDA model to one row per document with its modal topic.
# NOTE(review): the original recomputed topics(lda.model) three times inside
# aes(); naming the column once is equivalent and cheaper.
topics_df <- data.frame(topic = topics(lda.model))

# Plot the count of documents assigned to each topic.
# `..count..` is deprecated in ggplot2; after_stat(count) is the supported form.
ggplot(topics_df, aes(x = factor(topic))) +
  geom_bar(aes(fill = factor(topic)), alpha = 0.8) +
  scale_fill_discrete(name = "Topic") +
  labs(title = "Topic Distribution Based on Latent Dirichlet Allocation (LDA)", x = "Topic", y = "Count") +
  theme(plot.title = element_text(hjust = 0.5),
        axis.text.x = element_text(angle = 45, hjust = 1),
        axis.title = element_text(size = 12),
        panel.background = element_rect(fill = "white"),
        panel.grid.major = element_line(colour = "gray90")) +
  geom_text(stat = 'count', aes(label = after_stat(count)), vjust = -0.5)
```



```{r}
## Structural Topic Modeling (STM)
# Spectral initialisation is deterministic, so the STM fit is reproducible
n.topics <- 5
dfm2stm <- convert(dfm.trim, to = "stm")
modell.stm <- stm(dfm2stm$documents, dfm2stm$vocab, K = n.topics, data = dfm2stm$meta, init.type = "Spectral")
```

```{r}
# Top 10 terms for each topic
# (labelTopics()$prob is topics x terms; transpose to get one column per topic)
as.data.frame(t(labelTopics(modell.stm, n = 10)$prob))

# Generate a word cloud for the first topic
par(mar=c(0.5, 0.5, 0.5, 0.5))
cloud(modell.stm, topic = 1, scale = c(2.25,.5))

# Plot a summary of the estimated topic shares for the corpus
plot(modell.stm, type = "summary", text.cex = 0.5, main = "Topic shares on the corpus as a whole", xlab = "estimated share of topics")
```


```{r}
# Print the top 5 terms for each of the 5 topics
labelTopics(modell.stm,topics = c(1:5), n=10)

# Plot histograms of the estimated topic shares within the documents for 3 randomly selected topics
# NOTE(review): sample() is unseeded, so the plotted topics differ per run.
plot(modell.stm, type = "hist", topics = sample(1:n.topics, size = 3), main = "histogram of the topic shares within the documents")

# Plot the top terms for each of the 5 topics
plot(modell.stm, type = "labels", topics = c(1,2,3,4,5), main = "Topic terms")
```

```{r}
library(ggplot2)
library(dplyr)
# Identify the most probable topic for each document
# (row-wise argmax over the document-topic proportion matrix theta)
top_topics <- apply(modell.stm$theta, 1, which.max)
# Add the topic labels to the original data frame
my_data_hbd$Topic_stm <- top_topics
# Calculate the count of each topic
df_topic_counts <- my_data_hbd %>%
  group_by(Topic_stm) %>%
  summarize(count = n())

# Create the plot
ggplot(df_topic_counts, aes(x = factor(Topic_stm), y = count, fill = factor(Topic_stm))) +
  geom_bar(stat = "identity", alpha = 0.8) +
  scale_fill_discrete(name = "Topic") +
  labs(title = "Topic Distribution Based on Structural Topic Modeling (STM)", x = "Topic", y = "Count") +
  theme(plot.title = element_text(hjust = 0.5),
        axis.text.x = element_text(angle = 45, hjust = 1),
        axis.title = element_text(size = 12),
        panel.background = element_rect(fill = "white"),
        panel.grid.major = element_line(colour = "gray90")) +
  geom_text(aes(label = count), vjust=-0.5)

```



```{r}
## To see how the topics changed over time (LDA model)

# Add the topic labels to the original data frame
my_data_hbd$Topic_lda <- topics(lda.model)

# Convert the date column to a Date object
my_data_hbd$first_publication_date <- as.Date(my_data_hbd$first_publication_date, format = "%Y-%m-%d")

# Calculate the proportion of each topic by year
# NOTE(review): `.groups = 'drop'` already ungroups, so the following
# ungroup() is a harmless no-op. lubridate is accessed via ::, so no library() call.
topic_proportions <- my_data_hbd %>%
  group_by(Topic_lda, year = lubridate::year(first_publication_date)) %>%
  summarise(count = n(), .groups = 'drop') %>%
  ungroup() %>%
  group_by(year) %>%
  mutate(proportion = count / sum(count))

# Plot the topic proportions over time
ggplot(topic_proportions, aes(x = year, y = proportion, color = factor(Topic_lda))) +
  geom_line() +
  labs(x = "Year", y = "Proportion", color = "Topic") +
  theme_bw()

# Calculate the proportion of each topic by half-year
# (half_year already determines year, so grouping by both is redundant but harmless)
topic_proportions_half_year <- my_data_hbd %>%
  group_by(Topic_lda, year = lubridate::year(first_publication_date), half_year = lubridate::floor_date(first_publication_date, unit = "6 months")) %>%
  summarise(count = n(), .groups = 'drop') %>%
  ungroup() %>%
  group_by(year, half_year) %>%
  mutate(proportion = count / sum(count))

# Plot the topic proportions over time
ggplot(topic_proportions_half_year, aes(x = half_year, y = proportion, color = factor(Topic_lda))) +
  geom_line() +
  labs(x = "Half-Year", y = "Proportion", color = "Topic") +
  theme_bw()
```



```{r}
## To see how the topics changed over time (STM model)

# Identify the most probable topic for each document
top_topics <- apply(modell.stm$theta, 1, which.max)
# Add the topic labels to the original data frame
my_data_hbd$Topic_stm <- top_topics

# Calculate the proportion of each topic by year
topic_proportions_stm <- my_data_hbd %>%
  group_by(Topic_stm, year = lubridate::year(first_publication_date)) %>%
  summarise(count = n(), .groups = 'drop') %>%
  ungroup() %>%
  group_by(year) %>%
  mutate(proportion = count / sum(count))

# Plot the topic proportions over time
ggplot(topic_proportions_stm, aes(x = year, y = proportion, color = factor(Topic_stm))) +
  geom_line() +
  labs(x = "Year", y = "Proportion", color = "Topic") +
  theme_bw()

# Calculate the proportion of each topic by half-year
# (mirrors the LDA half-year aggregation above for a like-for-like comparison)
topic_proportions_stm_half_year <- my_data_hbd %>%
  group_by(Topic_stm, year = lubridate::year(first_publication_date), half_year = lubridate::floor_date(first_publication_date, unit = "6 months")) %>%
  summarise(count = n(), .groups = 'drop') %>%
  ungroup() %>%
  group_by(year, half_year) %>%
  mutate(proportion = count / sum(count))


# Plot the topic proportions over time
ggplot(topic_proportions_stm_half_year, aes(x = half_year, y = proportion, color = factor(Topic_stm))) +
  geom_line() +
  labs(x = "Half-Year", y = "Proportion", color = "Topic") +
  theme_bw()
```

```{r}
# Refine the graphs, add the top 5 terms for each topic to the legend (LDA model)

# Extract the top 5 terms for each topic.
# terms() returns a terms x topics character matrix (one column per topic).
top_terms <- terms(lda.model, 5)

# Build the legend title: one "k: term1, ..., term5" line per topic.
# A vapply over the columns replaces the original hand-written five-branch
# paste(..., sep = "") (paste0 is the idiomatic form); the resulting string
# is identical.
topic_lines <- vapply(
  seq_len(ncol(top_terms)),
  function(k) paste0(k, ": ", paste(top_terms[, k], collapse = ", ")),
  character(1)
)
legend_label <- paste0("Top 5 Terms\n", paste(topic_lines, collapse = "\n"))

# Plot the topic proportions over time
ggplot(topic_proportions_half_year, aes(x = half_year, y = proportion, color = factor(Topic_lda))) +
  geom_line() +
  labs(x = "Half-Year", y = "Proportion", 
       color = legend_label,
       subtitle = "Topic Changes Over Time Using LDA Topic Modelling") +
  theme_bw()

```


```{r}
# Refine the graphs, add the top 5 terms for each topic to the legend (STM model)

# Extract the top 5 FREX terms for each topic.
# labelTopics()$frex is a topics x terms matrix (one ROW per topic — the
# opposite orientation of terms() above).
top_terms_stm <- labelTopics(modell.stm, topics = c(1:5), n=5)
top_terms_stm$frex

# Build the legend title: one "k: term1, ..., term5" line per topic.
# vapply + paste0 replaces the original hand-written five-branch
# paste(..., sep = ""); the resulting string is identical.
topic_lines_stm <- vapply(
  seq_len(nrow(top_terms_stm$frex)),
  function(k) paste0(k, ": ", paste(top_terms_stm$frex[k, ], collapse = ", ")),
  character(1)
)
legend_label_stm <- paste0("Top 5 Terms\n", paste(topic_lines_stm, collapse = "\n"))

# Plot the topic proportions over time
ggplot(topic_proportions_stm_half_year, aes(x = half_year, y = proportion, color = factor(Topic_stm))) +
  geom_line() +
  labs(x = "Half-Year", y = "Proportion", 
       color = legend_label_stm,
       subtitle = "Topic Changes Over Time Using Structural Topic Modeling (STM)") +
  theme_bw()

```

```{r}
# Try to combine sentiment and topic changes over time
library(patchwork)

# Create a line chart of sentiment score over time
p1 <- ggplot(sentiment_by_date, aes(x = first_publication_date, y = sentiment_score)) +
  geom_line() +
  geom_hline(yintercept = 0, linetype = "dashed", color = "blue") +
  labs(title = "Sentiment Score over Time Based on Syuzhet Package", x = "Date", y = "Sentiment Score") +
  theme_minimal()+
  theme(plot.title = element_text(size = 11))

# Plot the topic proportions over time using LDA
p2 <- ggplot(topic_proportions_half_year, aes(x = half_year, y = proportion, color = factor(Topic_lda))) +
  geom_line() +
  labs(x = "Half-Year", y = "Proportion", 
       color = "Topic", subtitle = "Topic Changes Over Time Using LDA Topic Modelling") +
  theme_bw()

# Plot the topic proportions over time using STM
p3 <- ggplot(topic_proportions_stm_half_year, aes(x = half_year, y = proportion, color = factor(Topic_stm))) +
  geom_line() +
  labs(x = "Half-Year", y = "Proportion", 
       color = "Topic",
       subtitle = "Topic Changes Over Time Using Structural Topic Modeling") +
  theme_bw()

# Combine the three plots vertically (patchwork: + collects, plot_layout stacks)
p1 + p2 + p3 + plot_layout(ncol = 1)

```







